Import Dataset
# Load the HR data set. In the recorded output below the first column shows up
# as `ï..Emp_Id`, the signature of a UTF-8 byte-order mark at the start of the
# file; reading with the "UTF-8-BOM" encoding strips it so the column is named
# `Emp_Id`. (The recorded output in this document predates this fix.)
data <- read.csv("HR_Employee_Data.csv", fileEncoding = "UTF-8-BOM")
Summary of the Data
# Preview the first rows to eyeball the raw values in each column.
head(data)
## ï..Emp_Id satisfaction_level last_evaluation number_project
## 1 IND02438 38% 53% 2
## 2 IND28133 80% 86% 5
## 3 IND07164 11% 88% 7
## 4 IND30478 72% 87% 5
## 5 IND24003 37% 52% 2
## 6 IND08609 41% 50% 2
## average_montly_hours time_spend_company Work_accident left
## 1 157 3 0 1
## 2 262 6 0 1
## 3 272 4 0 1
## 4 223 5 0 1
## 5 159 3 0 1
## 6 153 3 0 1
## promotion_last_5years Department salary
## 1 0 sales low
## 2 0 sales medium
## 3 0 sales medium
## 4 0 sales low
## 5 0 sales low
## 6 0 sales low
# Show each column's storage type together with its leading values.
str(data)
## 'data.frame': 14999 obs. of 11 variables:
## $ ï..Emp_Id : chr "IND02438" "IND28133" "IND07164" "IND30478" ...
## $ satisfaction_level : chr "38%" "80%" "11%" "72%" ...
## $ last_evaluation : chr "53%" "86%" "88%" "87%" ...
## $ number_project : int 2 5 7 5 2 2 6 5 5 2 ...
## $ average_montly_hours : int 157 262 272 223 159 153 247 259 224 142 ...
## $ time_spend_company : int 3 6 4 5 3 3 4 5 5 3 ...
## $ Work_accident : int 0 0 0 0 0 0 0 0 0 0 ...
## $ left : int 1 1 1 1 1 1 1 1 1 1 ...
## $ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Department : chr "sales" "sales" "sales" "sales" ...
## $ salary : chr "low" "medium" "medium" "low" ...
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# dplyr's compact overview: row/column counts, then one line per column.
glimpse(data)
## Rows: 14,999
## Columns: 11
## $ ï..Emp_Id <chr> "IND02438", "IND28133", "IND07164", "IND30478", ~
## $ satisfaction_level <chr> "38%", "80%", "11%", "72%", "37%", "41%", "10%",~
## $ last_evaluation <chr> "53%", "86%", "88%", "87%", "52%", "50%", "77%",~
## $ number_project <int> 2, 5, 7, 5, 2, 2, 6, 5, 5, 2, 2, 6, 4, 2, 2, 2, ~
## $ average_montly_hours <int> 157, 262, 272, 223, 159, 153, 247, 259, 224, 142~
## $ time_spend_company <int> 3, 6, 4, 5, 3, 3, 4, 5, 5, 3, 3, 4, 5, 3, 3, 3, ~
## $ Work_accident <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ left <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ promotion_last_5years <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ Department <chr> "sales", "sales", "sales", "sales", "sales", "sa~
## $ salary <chr> "low", "medium", "medium", "low", "low", "low", ~
# Per-column summary: quartiles and mean for the numeric columns, length and
# class for the character columns.
summary(data)
## ï..Emp_Id satisfaction_level last_evaluation number_project
## Length:14999 Length:14999 Length:14999 Min. :2.000
## Class :character Class :character Class :character 1st Qu.:3.000
## Mode :character Mode :character Mode :character Median :4.000
## Mean :3.803
## 3rd Qu.:5.000
## Max. :7.000
## average_montly_hours time_spend_company Work_accident left
## Min. : 96.0 Min. : 2.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:156.0 1st Qu.: 3.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :200.0 Median : 3.000 Median :0.0000 Median :0.0000
## Mean :201.1 Mean : 3.498 Mean :0.1446 Mean :0.2381
## 3rd Qu.:245.0 3rd Qu.: 4.000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :310.0 Max. :10.000 Max. :1.0000 Max. :1.0000
## promotion_last_5years Department salary
## Min. :0.00000 Length:14999 Length:14999
## 1st Qu.:0.00000 Class :character Class :character
## Median :0.00000 Mode :character Mode :character
## Mean :0.02127
## 3rd Qu.:0.00000
## Max. :1.00000
Check for Null Values
# Count the missing values in every column in a single pass and bind the
# counts into a one-column matrix whose row names are the column names.
cbind(lapply(data, function(column) sum(is.na(column))))
## [,1]
## ï..Emp_Id 0
## satisfaction_level 0
## last_evaluation 0
## number_project 0
## average_montly_hours 0
## time_spend_company 0
## Work_accident 0
## left 0
## promotion_last_5years 0
## Department 0
## salary 0
# Total number of missing values across the whole data frame.
sum(is.na(data))
## [1] 0
Data Cleaning
# Strip the literal percent sign and store satisfaction as whole-number
# percentage points (e.g. "38%" becomes 38L). `fixed = TRUE` matches "%" as
# plain text rather than as a regular expression; gsub() already coerces its
# input to character, so no explicit as.character() is needed.
data$satisfaction_level <- as.integer(
  gsub("%", "", data$satisfaction_level, fixed = TRUE)
)
head(data)
## ï..Emp_Id satisfaction_level last_evaluation number_project
## 1 IND02438 38 53% 2
## 2 IND28133 80 86% 5
## 3 IND07164 11 88% 7
## 4 IND30478 72 87% 5
## 5 IND24003 37 52% 2
## 6 IND08609 41 50% 2
## average_montly_hours time_spend_company Work_accident left
## 1 157 3 0 1
## 2 262 6 0 1
## 3 272 4 0 1
## 4 223 5 0 1
## 5 159 3 0 1
## 6 153 3 0 1
## promotion_last_5years Department salary
## 1 0 sales low
## 2 0 sales medium
## 3 0 sales medium
## 4 0 sales low
## 5 0 sales low
## 6 0 sales low
# Strip the literal percent sign and store the evaluation score as
# whole-number percentage points (e.g. "53%" becomes 53L). `fixed = TRUE`
# matches "%" as plain text rather than as a regular expression; gsub()
# already coerces its input to character.
data$last_evaluation <- as.integer(
  gsub("%", "", data$last_evaluation, fixed = TRUE)
)
head(data)
## ï..Emp_Id satisfaction_level last_evaluation number_project
## 1 IND02438 38 53 2
## 2 IND28133 80 86 5
## 3 IND07164 11 88 7
## 4 IND30478 72 87 5
## 5 IND24003 37 52 2
## 6 IND08609 41 50 2
## average_montly_hours time_spend_company Work_accident left
## 1 157 3 0 1
## 2 262 6 0 1
## 3 272 4 0 1
## 4 223 5 0 1
## 5 159 3 0 1
## 6 153 3 0 1
## promotion_last_5years Department salary
## 1 0 sales low
## 2 0 sales medium
## 3 0 sales medium
## 4 0 sales low
## 5 0 sales low
## 6 0 sales low
1) Correlation plot
#install.packages("remotes")
#remotes::install_github("kmaheshkulkarni/corrly")
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.1
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(corrly)
# Correlation heat map over the numeric columns. Selecting by type instead of
# by position (previously columns 2 to 9) keeps the call correct if columns
# are added or reordered; at this point in the script it picks the same eight
# columns, satisfaction_level through promotion_last_5years.
numeric_columns <- vapply(data, is.numeric, logical(1))
matrixly(data[numeric_columns])
## Warning in plotly::config(., displaylogo = FALSE, collaborate = FALSE): The
## collaborate button is no longer supported
## Warning: 'config' objects don't have these attributes: 'collaborate'
## Valid attributes include:
## 'autosizable', 'displaylogo', 'displayModeBar', 'doubleClick', 'doubleClickDelay', 'editable', 'edits', 'fillFrame', 'frameMargins', 'globalTransforms', 'linkText', 'locale', 'locales', 'logging', 'mapboxAccessToken', 'modeBarButtons', 'modeBarButtonsToAdd', 'modeBarButtonsToRemove', 'notifyOnLogging', 'plotGlPixelRatio', 'plotlyServerURL', 'queueLength', 'responsive', 'scrollZoom', 'sendData', 'setBackground', 'showAxisDragHandles', 'showAxisRangeEntryBoxes', 'showEditInChartStudio', 'showLink', 'showSendToCloud', 'showSources', 'showTips', 'staticPlot', 'toImageButtonOptions', 'topojsonURL', 'watermark'
## Warning: 'heatmap' objects don't have these attributes: 'marker'
## Valid attributes include:
## 'autocolorscale', 'coloraxis', 'colorbar', 'colorscale', 'connectgaps', 'customdata', 'customdatasrc', 'dx', 'dy', 'hoverinfo', 'hoverinfosrc', 'hoverlabel', 'hoverongaps', 'hovertemplate', 'hovertemplatesrc', 'hovertext', 'hovertextsrc', 'ids', 'idssrc', 'legendgroup', 'legendgrouptitle', 'legendrank', 'meta', 'metasrc', 'name', 'opacity', 'reversescale', 'showlegend', 'showscale', 'stream', 'text', 'textsrc', 'transforms', 'transpose', 'type', 'uid', 'uirevision', 'visible', 'x', 'x0', 'xaxis', 'xcalendar', 'xgap', 'xhoverformat', 'xperiod', 'xperiod0', 'xperiodalignment', 'xsrc', 'xtype', 'y', 'y0', 'yaxis', 'ycalendar', 'ygap', 'yhoverformat', 'yperiod', 'yperiod0', 'yperiodalignment', 'ysrc', 'ytype', 'z', 'zauto', 'zhoverformat', 'zmax', 'zmid', 'zmin', 'zsmooth', 'zsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
2) People who have left in each department
#install.packages('ecodist')
library(plotly)
library(dplyr)
library(ecodist)
## Warning: package 'ecodist' was built under R version 4.1.3
# Count employees per department, split by the `left` flag. crosstab() names
# the two count columns X0 (left == 0) and X1 (left == 1).
ans <- crosstab(data$Department, data$left)
# The department labels live in the row names of the table; the x formula
# below refers to this vector by name.
Department <- rownames(ans)
# Grouped bars: one bar for stayers and one for leavers per department.
fig <- plot_ly(
  ans,
  x = ~Department, y = ~X0,
  type = "bar", name = "Working for Company"
) %>%
  add_trace(y = ~X1, name = "Left the Company") %>%
  layout(
    title = "Employees who have left based on department",
    yaxis = list(title = "Count"),
    barmode = "group"
  )
fig
3) People who have left based on salary
#install.packages('ecodist')
library(plotly)
library(dplyr)
library(ecodist)
# Count employees per salary band, split by the `left` flag. crosstab() names
# the two count columns X0 (left == 0) and X1 (left == 1).
ans <- crosstab(data$salary, data$left)
# The salary labels live in the row names of the table; the x formula below
# refers to this vector by name.
Salary <- rownames(ans)
# Grouped bars: one bar for stayers and one for leavers per salary band.
fig <- plot_ly(
  ans,
  x = ~Salary, y = ~X0,
  type = "bar", name = "Working for Company"
) %>%
  add_trace(y = ~X1, name = "Left the Company") %>%
  layout(
    title = "Employees who have left based on salary",
    yaxis = list(title = "Count"),
    barmode = "group"
  )
fig
4) Area plot of time spent in the company for those who have and have not left
# Cross-tabulate years at the company (rows) against the `left` flag
# (columns X0 / X1) and print the resulting counts.
ans <- crosstab(data$time_spend_company, data$left)
ans
## X0 X1
## 2 3191 53
## 3 4857 1586
## 4 1667 890
## 5 640 833
## 6 509 209
## 7 188 0
## 8 162 0
## 10 214 0
# Tenure values are carried as the row names of the table, so they come back
# as character strings rather than numbers.
Time_Spent <- rownames(ans)
Time_Spent
## [1] "2" "3" "4" "5" "6" "7" "8" "10"
# Overlaid area chart of head-count by tenure for stayers (X0) and leavers
# (X1). Time_Spent holds the table's row names as character strings
# ("2" ... "10"); converting them to integers gives a numeric x-axis, so the
# years are positioned by value (with the absent 9-year tenure leaving a gap)
# instead of being treated as category labels.
fig <- plot_ly(
  ans,
  x = ~as.integer(Time_Spent), y = ~X0,
  type = "scatter", mode = "lines",
  name = "Working for Company", fill = "tozeroy"
) %>%
  add_trace(y = ~X1, name = "Left the Company", fill = "tozeroy") %>%
  layout(
    xaxis = list(title = "Time Worked"),
    yaxis = list(title = "Count")
  )
fig
Naive Bayes
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 4.1.3
## naivebayes 0.9.7 loaded
library(dplyr)
library(ggplot2)
library(psych)
## Warning: package 'psych' was built under R version 4.1.3
##
## Attaching package: 'psych'
## The following object is masked from 'package:ecodist':
##
## distance
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Convert the 0/1 outcome to a two-level factor so it is modelled as a class
# label by the classifiers fitted below, then confirm the column types.
data$left <- as.factor(data$left)
str(data)
## 'data.frame': 14999 obs. of 11 variables:
## $ ï..Emp_Id : chr "IND02438" "IND28133" "IND07164" "IND30478" ...
## $ satisfaction_level : int 38 80 11 72 37 41 10 92 89 42 ...
## $ last_evaluation : int 53 86 88 87 52 50 77 85 100 53 ...
## $ number_project : int 2 5 7 5 2 2 6 5 5 2 ...
## $ average_montly_hours : int 157 262 272 223 159 153 247 259 224 142 ...
## $ time_spend_company : int 3 6 4 5 3 3 4 5 5 3 ...
## $ Work_accident : int 0 0 0 0 0 0 0 0 0 0 ...
## $ left : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Department : chr "sales" "sales" "sales" "sales" ...
## $ salary : chr "low" "medium" "medium" "low" ...
# Drop the employee id (column 1); columns 2 to 11 hold the outcome `left`
# and the predictors.
dataNB <- data[2:11]
# pairs.panels(data)
# Fixed seed so the random train/test assignment is reproducible.
set.seed(234)
# Label every row 1 (probability 0.8, training) or 2 (probability 0.2, test).
smpl <- sample(2, nrow(dataNB), replace = TRUE, prob = c(0.8, 0.2))
train <- dataNB[smpl == 1, ]
test <- dataNB[smpl == 2, ]
# Naive Bayes classifier for `left` using every other column as a predictor.
mdl <- naive_bayes(left ~ ., data = train)
# mdl
Accuracy : [1] 0.7914652
Decision Tree (DT) Model
#install.packages("rpart.plot")
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.3
## Loading required package: rpart
# Fixed seed so the random train/test assignment is reproducible.
set.seed(234)
# Drop the employee id (column 1); columns 2 to 11 hold the outcome `left`
# and the predictors.
dataDT <- data[2:11]
# Label every row 1 (probability 0.8, training) or 2 (probability 0.2, test).
smpl <- sample(2, nrow(dataDT), replace = TRUE, prob = c(0.8, 0.2))
train <- dataDT[smpl == 1, ]
test <- dataDT[smpl == 2, ]
# Classification tree for `left` using every other column as a predictor.
fit <- rpart(left ~ ., data = train, method = "class")
# `extra = 106` selects what is printed in each node; see the rpart.plot
# documentation for the meaning of the code.
rpart.plot(fit, extra = 106)

Prediction
# Predicted class for every held-out row.
predict_unseen <- predict(fit, newdata = test, type = "class")
# Confusion matrix: actual `left` in the rows, predicted class in the columns.
table_mat <- table(test$left, predict_unseen)
table_mat
## predict_unseen
## 0 1
## 0 2270 26
## 1 58 647
# Accuracy is the share of test rows on the diagonal of the confusion matrix,
# i.e. rows whose predicted class equals the actual class.
correct_predictions <- sum(diag(table_mat))
accuracy_Test <- correct_predictions / sum(table_mat)
accuracy_Test
## [1] 0.9720093
Accuracy: [1] 0.9720093